home *** CD-ROM | disk | FTP | other *** search
- /* WIDE AREA INFORMATION SERVER SOFTWARE:
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
-
- Brewster@think.com
- */
-
- /*
- * Indexes the words in a text file.
- *
- * Port of irtfiles.lisp.
- *
- * -brewster 6/90
- */
-
- /* the main functions are:
- * index_text_file
- * index_directory
- *
- * Some of the policy issues coded in this file are
- * What extra weight should the headline get?
- *
- */
-
- #include <ctype.h>
- #include <string.h>
- #include "panic.h"
- #include "irdirent.h"
- #include "irhash.h"
- #include "cutil.h"
- #include "futil.h"
- #include "irfiles.h"
- #include "irtfiles.h"
-
- #ifndef THINK_C
- #include <sys/types.h>
- #include <sys/stat.h>
- #endif /* ndef THINK_C */
-
- #define MAX_LINE_LENGTH 1000 /* characters */
- #define extra_weight_for_header 10
-
- #ifdef UNIX
- #define PRINT_AS_INDEXING true /* also defined in irfiles.c */
- #else
- #define PRINT_AS_INDEXING false
- #endif
-
- char* header_flag_1;
- char* header_flag_2;
- long len_of_files_since_last_delete = 0;
- long len_of_files_since_last_flush = 0;
- long total_indexed_file_length = 0;
-
- boolean indexingForBeta = false;
-
- #ifdef NOTUSED
- #define WORD_LETTERS "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890"
-
-
- static char *new_word _AP((char* line,char* word));
-
- static char *new_word(line,word)
- char *line;
- char *word;
- {
- /* This copies the first word from line into word while downcasing it.
- It returns a pointer into line that is after the word,
- which can be used to call this function again.
- If there are no words left, then NULL is returned,
- and word is length 0.
- There has got to be a better way.
- */
- long i = 0;
- char *beginning_ptr = strpbrk(line, WORD_LETTERS);
- char *next_word;
- long length;
- if(NULL == beginning_ptr){
- word[0] = '\0';
- return(NULL);
- }
- length = strspn(beginning_ptr, WORD_LETTERS);
- next_word = length + beginning_ptr;
-
- length = MIN(MAX_WORD_LENGTH,length);
- for(i=0; i<length; i++){
- word[i] = char_downcase((unsigned long)*beginning_ptr++);
- }
- word[i] = '\0';
- return(next_word);
- }
-
- static boolean reasonable_word _AP((char* word));
-
- static boolean reasonable_word(word)
- char* word;
- /* this should be more sophisticated */
- {
- if(strlen(word) > 1){
- return(TRUE);
- }
- else{
- return(FALSE);
- }
- }
-
- #endif /* def NOTUSED */
-
- static long add_words_if_appropriate
- _AP((char* line,long document_id,long weight,long file_position_before_line,
- long* line_length,boolean* newline_terminated,database* db));
-
- static long
- add_words_if_appropriate(line,
- document_id,
- weight,
- file_position_before_line,
- line_length,
- newline_terminated,
- db)
- char* line;
- long document_id;
- long weight;
- long file_position_before_line;
- long *line_length;
- boolean *newline_terminated;
- database* db;
- {
- /* Add words to the index if it should be done.
- * Returns the number of words added.
- * Should it return the amount of weight added?
- * The line length is side effected with the length of the line.
- * Newline_terminated is set based on whether the last character
- * in the string was a newline. If it was not, then it fgets probably
- * did not retrieve the whole line.
- */
-
- long position_in_word = 0;
- long word_count = 0;
- char word[MAX_WORD_LENGTH + 1];
- unsigned long ch;
- long char_count = 0;
-
- for(ch = (unsigned char)line[char_count++];
- ch != '\0'; ch = (unsigned char)line[char_count++]){
- boolean alnum = isalnum(ch);
- if(alnum){
- /* put the character in the word if not too long */
- if(position_in_word < MAX_WORD_LENGTH){
- word[position_in_word++] = char_downcase((unsigned long)ch);
- }
- }
- else{ /* not an in a word */
- if(position_in_word != 0){
- /* then we have collected a word */
- if(position_in_word > 1){ /* is it reasonable ? */
- word[position_in_word] = '\0';
- add_word(word,
- file_position_before_line + char_count,
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- db);
- word_count++;
- }
- position_in_word = 0;
- }
- }
- }
- /* finish last word */
- if(position_in_word > 1){ /* is it reasonable ? */
- word[position_in_word] = '\0';
- add_word(word,
- file_position_before_line + char_count,
- 0L, /* line_pos */
- weight,
- document_id,
- (time_t)0L,
- db);
- word_count++;
- }
-
- /* for debugging */
- if(char_count - 1 != strlen(line)) {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "char_count: %ld, strlen: %ld", char_count, strlen(line));
- }
- if('\n' != line[char_count-2])
- *newline_terminated = false;
- else
- *newline_terminated = true;
-
- *line_length = char_count - 1;
- return(word_count);
- }
-
- static int nodecompare _AP((unsigned long* i,unsigned long* j));
-
static int
nodecompare(i,j)
     unsigned long *i, *j;
{
  /* Ordering function handed to qsort.  Only the first element each
   * argument points at takes part in the comparison.
   * Returns -1, 0 or 1 when *i is below, equal to, or above *j.
   */
  unsigned long left = i[0];
  unsigned long right = j[0];

  return((left > right) - (left < right));
}
-
- #define nodeRange 256 /* 2048 sprint nodes on a full sized machine - should
- be passed in */
- #define iterations_to_reorder 50 /* 1 is best but slow */
-
- static void finish_document
- _AP((char* header,char* line,long document_id,
- document_table_entry* the_document_table_entry,
- long file_position_before_line,database* db));
-
- static void
- finish_document(header,line,document_id,the_document_table_entry,file_position_before_line,db)
- char* header;
- char* line;
- long document_id;
- document_table_entry* the_document_table_entry;
- long file_position_before_line;
- database* db;
- { long line_length;
- boolean newline_terminated;
- if(0 != strlen(header)){
- /* add weights for the header (if there was one) */
- the_document_table_entry->document_length +=
- add_words_if_appropriate(header, document_id,
- extra_weight_for_header,
- file_position_before_line,
- &line_length,
- &newline_terminated,
- db);
- }
-
- /* store out the document header here */
- the_document_table_entry->headline_id =
- write_headline_table_entry(header, db);
- if(NULL == line)
- { /* EOF */
- /* if it goes to the end of the file, then
- * set the end_character to 0 so that it is clear that
- * it goes to the end of the file.
- */
- the_document_table_entry->end_character = 0;
- }
- else /* set the end_character */
- the_document_table_entry->end_character = file_position_before_line;
-
-
- /*
- waislog("start char: %ld, end char: %ld",
- the_document_table_entry->start_character,
- the_document_table_entry->end_character);
- */
-
- if (indexingForBeta)
- { /* we need to decide which sprint node this doc will go in.
- for now we will store the sn in the date field, but that
- is temporary
- NOTE that we must subract 1 from document_id, since we want
- a 0 based number
- */
- static unsigned long* nodes = NULL; /* size/node# inited to 0 to 2047 */
- static long minPos;
- unsigned long size;
-
- if (nodes == NULL)
- { long i;
- long startPos;
- time_t temp_time;
-
- nodes = (unsigned long*)s_malloc(sizeof(unsigned long)*nodeRange*2);
- srand((int)time(&temp_time)); /* try to distribute the entries */
- startPos = rand() % nodeRange; /* for indexes with < nodeRng docs */
- for (i = 0; i < nodeRange; i++)
- { nodes[(i * 2) + 1] = (i + startPos) % nodeRange;
- nodes[i * 2] = 0;
- }
- minPos = 0;
- /*printf("init: ");
- for (i = 0; i < nodeRange; i++)
- printf("<%lu,%lu> ",nodes[i*2],nodes[(i*2)+1]);
- nl();*/
- }
-
- /* place the document in the emptiest node (at minPos) */
- the_document_table_entry->date = (time_t)nodes[(minPos * 2) + 1];
-
- /* increment the size to account for document */
- size = nodes[minPos * 2];
- size += (the_document_table_entry->end_character -
- the_document_table_entry->start_character);
- nodes[minPos * 2] = size;
-
- if ((the_document_table_entry->end_character -
- the_document_table_entry->start_character) > 100000)
- printf("big doc %lu %s\n",the_document_table_entry->end_character - the_document_table_entry->start_character,header);
-
- minPos++;
-
- /* possibly reorder it */
- if (minPos > iterations_to_reorder)
- {
- long i;
- minPos = 0;
- /*printf("before: ");
- for (i = 0; i < nodeRange; i++)
- printf("<%lu,%lu> ",nodes[i*2],nodes[(i*2)+1]);
- nl();*/
- qsort((char*)nodes,nodeRange,sizeof(unsigned long) * 2,nodecompare);
- /*printf("after: ");
- for (i = 0; i < nodeRange; i++)
- printf("<%lu,%lu> ",nodes[i*2],nodes[(i*2)+1]);
- nl();*/
- printf("just sorted nodes, min: ");
- for (i = 0; i < 10; i++)
- printf("%lu ",nodes[i * 2]);
- printf(", max: %lu/%lu\n",nodes[(nodeRange * 2)-2],nodes[(nodeRange * 2)-1]);
- }
-
-
-
- #ifdef old
- sn = (document_id - 1) % 2048; /* 2048 = sn's in a full machine */
-
- /* should also take into account the "fullness" of any particular
- node */
- the_document_table_entry->date = (time_t)sn;
- /* waislog(WLOG_LOW, WLOG_INFO,
- "put %s in sprint node %ld",header,sn);*/
- #endif /* def old */
- }
-
- write_document_table_entry(the_document_table_entry, db);
- cprintf(PRINT_AS_INDEXING, ".");
- total_indexed_file_length = /* set this so the speed looks right */
- total_indexed_file_length + file_position_before_line;
- total_indexed_file_length = /* set it back */
- total_indexed_file_length - file_position_before_line;
- }
-
- #define LENGTH_OF_NEWLINE 1 /* this will be 2 on a PC, I think */
-
- void index_text_file(filename,
- separator_function,
- header_function,
- date_function,
- finish_header_function,
- type,
- db,
- check_for_text_file,
- check_for_file_already_indexed)
- char* filename;
- boolfunc *separator_function;
- voidfunc *header_function;
- longfunc *date_function;
- voidfunc *finish_header_function;
- char *type;
- database* db;
- boolean check_for_text_file;
- boolean check_for_file_already_indexed;
- {
- /* Addes words to the index for a given file.
- * The function arguments can be NULL which means it would
- * always answer NULL.
- * separator_function is called on every line to see if it
- * separates documents.
- * header_function is called on every line so that a headline
- * can be accumulated. This assumes that it will side effect global
- * variables.
- * finish_header_function is called when the document is finished
- * (by separator function responding TRUE or EOF) this will return
- * the headline string or NULL.
- * Presumably finish_header_function will use the
- * effects of header_function. finish_header_function
- * will only be called once, so it should clear whatever state
- * header_function has set.
- * if check_for_text_file then it looks to see if first character
- * in the file is a printable character.
- * if check_for_file_already_indexed then it looks through the filename
- * file to see if the file has not been indexed. If it has,
- * then it is checked to see if it is up-to-date. (it does not
- * kill the old entry (maybe it should)).
- */
-
- long filename_id;
- document_table_entry the_document_table_entry;
- long document_id = next_document_id(db);
- FILE* input_stream = s_fopen(filename, "r");
- long file_position_before_line = 0;
- long date;
-
- if(NULL == input_stream){
- waislog(WLOG_HIGH, WLOG_ERROR,
- "File %s does not exist", filename);
- /* then the is not a valid file to be indexed */
- return;
- }
- if(check_for_file_already_indexed){
- time_t time;
- char full_path[MAX_FILENAME_LEN];
- truename(filename, full_path);
- if(true == filename_in_database(full_path, type, &time, db)){
- /* check that it is the same time as this file */
- if(time == file_write_date(filename)){
- waislog(WLOG_HIGH, WLOG_INDEX,
- "File %s already indexed", filename);
- s_fclose(input_stream);
- return;
- }
- }
- }
-
- if(check_for_text_file){
- /* if we need this to be a text file, check the first character
- for a printable character */
- long ch = fgetc(input_stream);
- /* printf("First character is '%c'\n", ch); */
- if(EOF == ch || (!isprint(ch) && !isspace(ch))){
- s_fclose(input_stream);
- return;
- }
- ungetc(ch, input_stream);
- }
-
- /* write out the filename */
- filename_id = write_filename_table_entry(filename, type, db);
-
- /* (if (not *drop_table*) (make_drop_table)) maybe put in later */
-
- header_flag_1 = NULL;
- the_document_table_entry.filename_id = filename_id;
- the_document_table_entry.start_character = 0;
- the_document_table_entry.document_length = 0;
- the_document_table_entry.number_of_lines = 0;
- the_document_table_entry.date = 0;
-
- while(TRUE){
- long line_length;
- boolean newline_terminated;
- char line[MAX_LINE_LENGTH];
- char header[MAX_LINE_LENGTH];
- char* read_line_result;
- boolean eof;
-
- /* printf("ftell: %ld\n", ftell(input_stream)); */
- /* read a line */
- read_line_result = fgets(line, MAX_LINE_LENGTH, input_stream);
- beFriendly();
-
- /* eof = feof(input_stream); */ /* zero means not eof */
- eof = !read_line_result;
-
- the_document_table_entry.number_of_lines++;
-
- header[0] = '\0'; /* set it to the empty string */
- if(eof ||
- ((NULL != separator_function) &&
- separator_function(line))){
-
- /* we are processing a separator, therefore we should
- * finish off the last document, and start a new one
- */
- if(NULL != finish_header_function){
- finish_header_function(header);
- }
- if(0 == strlen(header)){
- char full_path[1000];
- char directory[1000];
- truename(filename, full_path);
- sprintf(header, "%s %s", pathname_name(full_path),
- pathname_directory(full_path, directory));
- }
- the_document_table_entry.number_of_lines--; /* dont count separator */
- /* finish off the last */
- finish_document(header, line, document_id,
- &the_document_table_entry,
- eof? /* if EOF, use file length */
- file_length(input_stream):file_position_before_line,
- db);
- /* initialize the next one */
- the_document_table_entry.filename_id = filename_id;
- the_document_table_entry.start_character = file_position_before_line;
- the_document_table_entry.number_of_lines = 1; /* count separator */
- the_document_table_entry.date = 0;
-
- document_id = next_document_id(db);
-
- if(!eof)
- { /* not EOF */
- if(NULL != header_function){
- header_function(line);
- }
- line_length = strlen(line);
- newline_terminated = true;
- }
- else{ /* EOF */
- /* printf("closing the file\n"); */
- s_fclose(input_stream);
- return;
- }
- }
-
- else{
- /* not a separator or EOF so process the line */
- long number_of_words;
- if(NULL != header_function) header_function(line);
- if (date_function != NULL &&
- the_document_table_entry.date == 0 &&
- (date = date_function(line)) > 0)
- the_document_table_entry.date = date;
-
- number_of_words = add_words_if_appropriate(line, document_id, 1L,
- file_position_before_line,
- &line_length,
- &newline_terminated,
- db);
- the_document_table_entry.document_length += number_of_words;
- len_of_files_since_last_delete += number_of_words;
- len_of_files_since_last_flush += number_of_words;
- }
- if(newline_terminated)
- file_position_before_line += (line_length +
- LENGTH_OF_NEWLINE /* in case of crlf */
- - 1 /* fgets gets one newline */
- );
- else
- file_position_before_line = ftell(input_stream);
-
-
- /* for debugging
- if(file_position_before_line != ftell(input_stream)) {
- waislog(WLOG_LOW, WLOG_INFO, "ftell: %ld, computed ftell: %ld",
- ftell(input_stream),
- file_position_before_line);
- }
- */
-
- }
- }
-
-
-
-
- /* return TRUE if it is a directory, FALSE otherwise */
- boolean directoryp(file)
- char *file;
-
- {
- #ifdef THINK_C
- return(false);
- #else
- struct stat stbuf;
- if(stat(file, &stbuf) == -1)
- return(FALSE);
- if((stbuf.st_mode & S_IFMT) == S_IFDIR)
- return(true);
- return(FALSE);
- #endif
- }
-
- /* return true if it is a file, FALSE otherwise */
- boolean filep(file)
- char *file;
- {
- #ifdef THINK_C
- return(probe_file(file));
- #else
- struct stat stbuf;
- if(stat(file, &stbuf) == -1)
- return(FALSE);
- if(!((stbuf.st_mode & S_IFMT) == S_IFDIR))
- return(true);
- return(FALSE);
- #endif
- }
-
- /* recursively indexes the directory specified.
- * If it is a file, then index it.
- */
- void index_directory(file,
- separator_function,
- header_function,
- date_function,
- finish_header_function,
- type,
- db,
- check_for_text_file,
- check_for_file_already_indexed)
- char *file;
- boolfunc *separator_function;
- voidfunc *header_function;
- longfunc *date_function;
- voidfunc *finish_header_function;
- char *type;
- database* db;
- boolean check_for_text_file;
- boolean check_for_file_already_indexed;
- {
- if(filep(file)){
- waislog(WLOG_MEDIUM, WLOG_INDEX,
- "%s Indexing file: %s", printable_time(), file);
- index_text_file(file, separator_function,
- header_function,
- date_function,
- finish_header_function,
- type,
- db,
- check_for_text_file,
- check_for_file_already_indexed);
- }
- else if(directoryp(file)){
- #ifndef THINK_C
- #ifdef SYSV
- FILE *dirp = s_fopen(file, "r");
- #else
- DIR *dirp = opendir(file);
- #endif
- struct dirent *dp;
- char name[1000]; /* max filename size */
-
- #ifdef SYSV
- while (fread((char *)dp, sizeof( *dp), 1, dirp) == 1) {
- #else
- while ((dp = readdir(dirp)) != NULL) {
- #endif
- if(strcmp(dp->d_name, ".") == 0
- || strcmp(dp->d_name, "..") == 0
- )
- continue;
- strcpy(name, file); /* copy the filename into the name variable */
- strcat(name, "/");
- strcat(name, dp->d_name);
- index_directory(name, separator_function,
- header_function,
- date_function,
- finish_header_function,
- type,
- db,
- check_for_text_file,
- check_for_file_already_indexed);
- }
- #ifdef SYSV
- s_fclose(dirp);
- #else /* ndef SYSV */
- closedir(dirp);
- #endif /* ndef SYSV */
- #endif /*ndef THINK_C */
- }
- }
-
-
-
-